Purpose

This document summarizes Rick Gilmore’s analysis of the Jaccard index data.

The goal is to explore ways to generate an empirical “null” distribution of the Jaccard index data to compare it to the observed data.

Set-up

Note: I have set eval=FALSE for a series of chunks below that generate the permuted sorting and Jaccard index data. These take many minutes to run.

Analysis plan

  1. Start with each participant’s raw sorting data within each wallpaper group.
  2. Permute the exemplars each participant sorted into similar piles by randomizing the mapping between the actual exemplar and the permuted exemplar id.
  1. Recalculate the Jaccard indices using the analysis/make.jaccard.df.R function.
  2. Do 1-3 n times, where n is large, probably 1,000.
  3. Compare the mean Jaccard indices (by wallpaper group) from the permuted set to the values we observed empirically.

Preliminary work

Let’s build and test a permutation function for the raw sorting data.

Now, let’s generate multiple permuted CSVs.

generate_n_sorting_permutations <-
  function(wp_group = "P1",
           n_permutations = 5) {
    csv_in <- paste0("analysis/data/", wp_group, "-sorting.csv")
    if (!file.exists(csv_in)) {
      stop(paste0("`csv_in` not found: ", csv_in))
    }
    
    df_in <- readr::read_csv(csv_in)
    
    df_exemplars <- df_in[, -c(1, 2, 23, 24)]
    out_m <- as.matrix(df_exemplars)
    for (p in 1:n_permutations) {
      csv_out <-
        paste0(
          "analysis/data/permutation_analysis/sorting_csv/",
          wp_group,
          "-sorting-perm-",
          stringr::str_pad(p, 3, pad = 0), ".csv"
        )
      
      for (r in 1:dim(out_m)[1]) {
        new_i <- sample(1:20)
        out_m[r, 1:20] <- out_m[r, new_i]
      }
      
      array_out <-
        as.data.frame(cbind(df_in$Participant, df_in$Set, out_m, df_in$Set_size, df_in$Group))
      
      # Rename!
      names(array_out) <-
        c("Participant",
          "Set",
          names(df_exemplars),
          "Set_size",
          "Group")
      array_out
    
      readr::write_csv(array_out, csv_out)
    }
  }

Then we test it.

generate_n_sorting_permutations()

Now, let’s confirm that we can calculate Jaccard indices from these data.

make_jaccard_csvs <- function(wallpaper_group = "P1",
                              duplicates = FALSE,
                              input_dir = 'analysis/data/permutation_analysis/sorting_csv/',
                              output_dir = 'analysis/data/permutation_analysis/jaccards/') {
  # Makes a data.frame from the raw sorting data
  
  # Load externals
  source("analysis/jaccard.data.R")
  source("analysis/jaccard.R")
  
  these_csvs <-
    list.files(input_dir, paste0("^", wallpaper_group, "\\-"), full.names = TRUE)
  purrr::map(these_csvs,
             calculate_save_jaccard_df,
             wallpaper_group,
             jaccard_dir = output_dir)
}

# Load a sorting permutation file, calculate the Jaccard indices, and (conditionally) save it to file.
calculate_save_jaccard_df <- function(this_csv,
                                      wallpaper_group,
                                      save_output = TRUE,
                                      jaccard_dir = "analysis/data/permutation_analysis/jaccards/",
                                      vb = FALSE) {
  
  this_fn <- basename(this_csv)
  this_perm_number <- stringr::str_extract(this_fn, "[0-9]{3}")
  out_fn <-
    paste0(jaccard_dir,
           wallpaper_group,
           "-jaccard-",
           this_perm_number,
           ".csv")
  
  this_df <- readr::read_csv(this_csv)
  
  # Calculate Jaccard
  jaccard_df <- jaccard.data(this_df)
  
  if (save_output) {
    if (vb) message(paste0('Saving ', out_fn))
    readr::write_csv(jaccard_df, out_fn)
  } else {
    jaccard_df
  }
}
make_jaccard_csvs()

Generate data

P1

generate_n_sorting_permutations("P1", n_permutations = 999)
make_jaccard_csvs("P1")

P31M

generate_n_sorting_permutations("P31M", n_permutations = 999)
make_jaccard_csvs("P31M")

P3M1

generate_n_sorting_permutations("P3M1", n_permutations = 999)
make_jaccard_csvs("P3M1")

P6

generate_n_sorting_permutations("P6", n_permutations = 999)
make_jaccard_csvs("P6")

P6M

generate_n_sorting_permutations("P6M", n_permutations = 999)
make_jaccard_csvs("P6M")

Analyze simulated results

Create helper functions

make_perm_jaccard_df <- function(this_csv) {
  this_fn <- basename(this_csv)
  this_perm_number <- stringr::str_extract(this_fn, "[0-9]{3}")
  this_df <- readr::read_csv(this_csv)
  
  this_df <- this_df %>%
    dplyr::mutate(
      .,
      exemplar_pair = paste0(
        stringr::str_extract(Exemplar.Row, "[0-9]{3}$"),
        "-",
        stringr::str_extract(Exemplar.Col, "[0-9]{3}$")
        ),
        perm = this_perm_number
    )
  
  this_df
}
make_aggregate_perm_jaccard_df <- function(wp_group = "P1",
                                           input_dir = "analysis/data/permutation_analysis/jaccards",
                                           save_csv = TRUE,
                                           output_dir = "analysis/data/permutation_analysis/aggregates",
                                           vb = TRUE) {
  these_csvs <-
    list.files(input_dir, paste0("^", wp_group, "\\-"), full.names = TRUE)
  df <- purrr::map_df(these_csvs, make_perm_jaccard_df)
  
  if (save_csv) {
    out_fn <- file.path(output_dir, paste0(wp_group, "-aggregate-perm-jaccard.csv"))
    if (vb) message(paste0("Saving ", out_fn))
    readr::write_csv(df, out_fn)
  } else {
    df
  }
}

Empirical distributions of Jaccard indices by group and exemplar-pair

P1

Import the data.

P1_perm_df <- make_aggregate_perm_jaccard_df("P1")
P1_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P1-aggregate-perm-jaccard.csv")

Visualize.

P1_perm_df %>%
  ggplot2::ggplot(.) +
  ggplot2::aes(x = Jaccard) +
  ggplot2::geom_histogram(bins = 50)

Generate summary stats by exemplar pair.

P1_perm_stats_df <- P1_perm_df %>%
  dplyr::group_by(., Group, exemplar_pair) %>%
  dplyr::summarize(., jaccard_mean = mean(Jaccard),
                   jaccard_sd = sd(Jaccard))

P31M

Import the data.

P31M_perm_df <- make_aggregate_perm_jaccard_df("P31M")
P31M_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P31M-aggregate-perm-jaccard.csv")

Visualize.

P31M_perm_df %>%
  ggplot2::ggplot(.) +
  ggplot2::aes(x = Jaccard) +
  ggplot2::geom_histogram(bins = 50)

Generate summary stats by exemplar pair.

P31M_perm_stats_df <- P31M_perm_df %>%
  dplyr::group_by(., Group, exemplar_pair) %>%
  dplyr::summarize(., jaccard_mean = mean(Jaccard),
                   jaccard_sd = sd(Jaccard))

P3M1

Import the data.

P3M1_perm_df <- make_aggregate_perm_jaccard_df("P3M1")
P3M1_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P3M1-aggregate-perm-jaccard.csv")

Visualize.

P3M1_perm_df %>%
  ggplot2::ggplot(.) +
  ggplot2::aes(x = Jaccard) +
  ggplot2::geom_histogram(bins = 50)

Generate summary stats by exemplar pair.

P3M1_perm_stats_df <- P3M1_perm_df %>%
  dplyr::group_by(., Group, exemplar_pair) %>%
  dplyr::summarize(., jaccard_mean = mean(Jaccard),
                   jaccard_sd = sd(Jaccard))

P6

Import the data.

P6_perm_df <- make_aggregate_perm_jaccard_df("P6")
P6_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P6-aggregate-perm-jaccard.csv")

Visualize.

P6_perm_df %>%
  ggplot2::ggplot(.) +
  ggplot2::aes(x = Jaccard) +
  ggplot2::geom_histogram(bins = 50)

Generate summary stats by exemplar pair.

P6_perm_stats_df <- P6_perm_df %>%
  dplyr::group_by(., Group, exemplar_pair) %>%
  dplyr::summarize(., jaccard_mean = mean(Jaccard),
                   jaccard_sd = sd(Jaccard))

P6M

Import the data.

P6M_perm_df <- make_aggregate_perm_jaccard_df("P6M")
P6M_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P6M-aggregate-perm-jaccard.csv")

Visualize.

P6M_perm_df %>%
  ggplot2::ggplot(.) +
  ggplot2::aes(x = Jaccard) +
  ggplot2::geom_histogram(bins = 50)

Generate summary stats by exemplar pair.

P6M_perm_stats_df <- P6M_perm_df %>%
  dplyr::group_by(., Group, exemplar_pair) %>%
  dplyr::summarize(., jaccard_mean = mean(Jaccard),
                   jaccard_sd = sd(Jaccard))

Aggregating across groups

jaccard_perm_df <- rbind(P1_perm_df, P31M_perm_df, P3M1_perm_df, P6_perm_df, P6M_perm_df)

Visualization.

jaccard_perm_df %>%
  ggplot(.) +
  aes(Jaccard, color = Group) +
  facet_grid(Group ~ .) +
  geom_boxplot(bins = 50)

jaccard_perm_df %>%
  ggplot(.) +
  aes(Jaccard, color = Group) +
  facet_grid(Group ~ .) +
  geom_boxplot(bins = 50)

jaccard_perm_df %>%
  ggplot(.) +
  aes(x = Group, y = Jaccard) +
  geom_violin()

These plots show that the mean differences in Jaccard indices are reflected in the participants’ data are shown in the permuted data, too. This makes sense since the participants detected regularities and sorted the exemplars into different numbers of sets. In permuting the exemplar identifiers within participants, we keep some of this structure.

Let’s try aggregating the by-exemplar statistics.

jaccard_perm_stats_df <- rbind(P1_perm_stats_df, P31M_perm_stats_df, P3M1_perm_stats_df, P6_perm_stats_df, P6M_perm_stats_df)

# Sort by group, exemplar_pair
jaccard_perm_stats_df <- jaccard_perm_stats_df %>%
  dplyr::arrange(Group, exemplar_pair)
jaccard_perm_stats_df %>%
  ggplot(.) +
  aes(x = jaccard_mean, fill = Group) +
  geom_histogram() +
  facet_grid(Group ~ .) + 
  ggtitle("Mean exemplar-pair Jaccard indices for permuted data")

jaccard_perm_stats_df %>%
  ggplot(.) +
  aes(x = jaccard_sd, fill = Group) +
  geom_histogram() +
  facet_grid(Group ~ .) +
  ggtitle("Standard deviation of exemplar-pair Jaccard indices for permuted data")

Compare observed Jaccard to empirical distribution

Load the observed data and clean it.

jaccard_observed_df <-
  readr::read_csv("analysis/data/jaccard-no-duplicates.csv")

jaccard_observed_df <- jaccard_observed_df %>%
  dplyr::mutate(.,
                exemplar_pair = paste0(
                  stringr::str_extract(Exemplar.Row, "[0-9]{3}$"),
                  "-",
                  stringr::str_extract(Exemplar.Col, "[0-9]{3}$")
                )) %>%
  dplyr::arrange(., Group, exemplar_pair)

Now, merge the permuted data with the observed data.

jaccard_merged_df <- dplyr::left_join(jaccard_perm_stats_df,
                                      jaccard_observed_df,
                                      by = c("Group", "exemplar_pair"))

# Rearrange columns for convenience
jaccard_merged_df <- jaccard_merged_df %>%
  dplyr::select(., Group, exemplar_pair, Jaccard, jaccard_mean, jaccard_sd, Exemplar.Row, Exemplar.Col)

# Rename variables for clarity
jaccard_merged_df <- jaccard_merged_df %>%
  dplyr::rename(., group = Group, jaccard_obs = Jaccard, 
                jaccard_emp_mean = jaccard_mean,
                jaccard_emp_sd = jaccard_sd,
                exemplar_row = Exemplar.Row,
                exemplar_col = Exemplar.Col)

Calculate empirical z as \(z_{emp}=J_{obs}-\mu_{J}\) for each exemplar pair.

jaccard_merged_df <- jaccard_merged_df %>%
  dplyr::mutate(., z_emp = (jaccard_obs-jaccard_emp_mean)/jaccard_emp_sd)

Plot a histogram of z_emp.

jaccard_merged_df %>%
  ggplot(.) +
  aes(z_emp, fill = group) +
  geom_histogram() +
  facet_grid(group ~ .)

Curiously, P31M, P6, P6M and P3M1 have exemplar pairs whose observed Jaccard indices are substantially larger than the empirically derived reference (null) distribution even though the mean Jaccard indices for P1 are the largest.

Just for fun, let’s print the exemplar pairs whose z_emp exceed 4.

jaccard_merged_df %>%
  dplyr::filter(., z_emp > 4) %>%
  dplyr::arrange(., group, desc(jaccard_emp_mean)) %>%
  knitr::kable(.)
group exemplar_pair jaccard_obs jaccard_emp_mean jaccard_emp_sd exemplar_row exemplar_col z_emp
P1 008-009 0.4347826 0.1997302 0.0523893 101008 101009 4.486650
P31M 006-016 0.3469388 0.1544913 0.0455907 115006 115016 4.221202
P31M 002-020 0.4347826 0.1541413 0.0445330 115002 115020 6.301875
P31M 004-009 0.4666667 0.1536301 0.0458028 115004 115009 6.834437
P31M 008-015 0.3469388 0.1534583 0.0454080 115008 115015 4.260934
P31M 014-020 0.5000000 0.1532609 0.0443117 115014 115020 7.824999
P31M 007-020 0.3469388 0.1530137 0.0472758 115007 115020 4.101998
P31M 002-007 0.6500000 0.1528091 0.0456546 115002 115007 10.890260
P31M 002-014 0.4347826 0.1523665 0.0464296 115002 115014 6.082672
P31M 007-014 0.3469388 0.1520057 0.0465605 115007 115014 4.186665
P31M 009-014 0.3469388 0.1501004 0.0442869 115009 115014 4.444622
P3M1 019-020 0.4042553 0.1436215 0.0475534 114019 114020 5.480869
P3M1 003-016 0.3469388 0.1431156 0.0447568 114003 114016 4.554010
P3M1 011-019 0.3750000 0.1415372 0.0445837 114011 114019 5.236508
P6 001-017 0.3200000 0.1384481 0.0436607 116001 116017 4.158245
P6 007-009 0.4666667 0.1380211 0.0441697 116007 116009 7.440517
P6 014-017 0.3200000 0.1376545 0.0451294 116014 116017 4.040509
P6 013-019 0.4888889 0.1373315 0.0433875 116013 116019 8.102725
P6 005-013 0.3333333 0.1365565 0.0442138 116005 116013 4.450579
P6 002-011 0.4042553 0.1363300 0.0431172 116002 116011 6.213881
P6 008-009 0.3541667 0.1361674 0.0434391 116008 116009 5.018501
P6 008-020 0.3265306 0.1359853 0.0425070 116008 116020 4.482680
P6 006-019 0.4666667 0.1356848 0.0441123 116006 116019 7.503171
P6 003-014 0.3265306 0.1354559 0.0438507 116003 116014 4.357394
P6 006-013 0.5581395 0.1348380 0.0450430 116006 116013 9.397711
P6M 008-010 0.3333333 0.1448769 0.0455945 117008 117010 4.133318
P6M 008-020 0.3541667 0.1437420 0.0410528 117008 117020 5.125709
P6M 001-011 0.3333333 0.1433422 0.0465964 117001 117011 4.077378
P6M 010-020 0.3829787 0.1432528 0.0453747 117010 117020 5.283256